In [ ]:
%run "../Functions/2. Google form analysis.ipynb"
In [ ]:
# GUIDs (localplayerguid values) of users who answered the questionnaire.
# Exactly one assignment is left active; cells further down that call
# getAnswers / getCorrections / getScore / getValidatedCheckpoints with
# `localplayerguid` inspect that user. To inspect another respondent,
# comment out the active line and uncomment a different one.
# French
#localplayerguid = 'a4d4b030-9117-4331-ba48-90dc05a7e65a'
#localplayerguid = 'd6826fd9-a6fc-4046-b974-68e50576183f'
#localplayerguid = 'deb089c0-9be3-4b75-9b27-28963c77b10c'
#localplayerguid = '75e264d6-af94-4975-bb18-50cac09894c4'
#localplayerguid = '3d733347-0313-441a-b77c-3e4046042a53'
# English
localplayerguid = '8d352896-a3f1-471c-8439-0f426df901c1'
#localplayerguid = '7037c5b2-c286-498e-9784-9a061c778609'
#localplayerguid = '5c4939b5-425b-4d19-b5d2-0384a515539e'
#localplayerguid = '7825d421-d668-4481-898a-46b51efe40f0'
#localplayerguid = 'acb9c989-b4a6-4c4d-81cc-6b5783ec71d8'
# devPCID5 is not defined in this notebook (presumably provided by the %run notebook).
#localplayerguid = devPCID5
In [ ]:
len(getAllResponders())
In [ ]:
userIdThatDidNotAnswer in gform['userId'].values, hasAnswered( userIdThatDidNotAnswer )
In [ ]:
assert(not hasAnswered( userIdThatDidNotAnswer )), "User has NOT answered"
In [ ]:
assert(hasAnswered( userId1AnswerEN )), "User HAS answered"
In [ ]:
assert(hasAnswered( userIdAnswersEN )), "User HAS answered"
In [ ]:
assert(hasAnswered( userId1AnswerFR )), "User HAS answered"
In [ ]:
assert(hasAnswered( userIdAnswersFR )), "User HAS answered"
In [ ]:
assert(hasAnswered( userIdAnswersENFR )), "User HAS answered"
In [ ]:
assert (len(getAnswers( userIdThatDidNotAnswer ).columns) == 0),"Too many answers"
In [ ]:
assert (len(getAnswers( userId1AnswerEN ).columns) == 1),"Too many answers"
In [ ]:
assert (len(getAnswers( userIdAnswersEN ).columns) >= 2),"Not enough answers"
In [ ]:
assert (len(getAnswers( userId1AnswerFR ).columns) == 1),"Not enough columns"
In [ ]:
assert (len(getAnswers( userIdAnswersFR ).columns) >= 2),"Not enough answers"
In [ ]:
assert (len(getAnswers( userIdAnswersENFR ).columns) >= 2),"Not enough answers"
In [ ]:
assert (len(getCorrections( userIdThatDidNotAnswer ).columns) == 0),"Too many answers"
In [ ]:
assert (len(getCorrections( userId1AnswerEN ).columns) == 2),"Too many answers"
In [ ]:
assert (len(getCorrections( userIdAnswersEN ).columns) >= 4),"Not enough answers"
In [ ]:
assert (len(getCorrections( userId1AnswerFR ).columns) == 2),"Too many answers"
In [ ]:
assert (len(getCorrections( userIdAnswersFR ).columns) >= 4),"Not enough answers"
In [ ]:
assert (len(getCorrections( userIdAnswersENFR ).columns) >= 4),"Not enough answers"
In [ ]:
In [ ]:
assert (len(pd.DataFrame(getScore( userIdThatDidNotAnswer ).values.flatten().tolist()).values.flatten().tolist()) == 0),"Too many answers"
In [ ]:
score = getScore( userId1AnswerEN )
#print(score)
assert (
(len(score.values.flatten()) == 3)
and
score[answerTemporalities[0]][0][0] == 0
),"Incorrect score"
# List every user with exactly one pretest score, no posttest score, and a
# non-zero pretest score. answerTemporalities[0] / [1] are the "before" /
# "after" temporalities used elsewhere in this notebook.
# The statements were collapsed onto a single line; one statement per line is
# restored so the loop is valid Python.
for userId in gform['userId'].values:
    score = getScore( userId )
    pretestScore = score[answerTemporalities[0]][0]
    posttestScore = score[answerTemporalities[1]][0]
    if len(pretestScore) == 1 and len(posttestScore) == 0 and 0 != pretestScore[0]:
        #gform[gform['userId']]
        print(userId + ': ' + str(pretestScore[0]))
In [ ]:
score = getScore( userIdAnswersEN )
#print(score)
assert (
(len(score.values.flatten()) == 3)
and
score[answerTemporalities[0]][0][0] == 5
and
score[answerTemporalities[1]][0][0] == 25
),"Incorrect score"
In [ ]:
score = getScore( userId1AnswerFR )
#print(score)
assert (
(len(score.values.flatten()) == 3)
and
score[answerTemporalities[0]][0][0] == 23
),"Incorrect score"
In [ ]:
score = getScore( userIdAnswersFR )
#print(score)
assert (
(len(score.values.flatten()) == 3)
and
score[answerTemporalities[0]][0][0] == 15
and
score[answerTemporalities[1]][0][0] == 26
),"Incorrect score"
In [ ]:
score = getScore( userIdAnswersENFR )
#print(score)
assert (
(len(score.values.flatten()) == 3)
and
score[answerTemporalities[0]][0][0] == 4
and
score[answerTemporalities[1]][0][0] == 13
),"Incorrect score"
In [ ]:
objective = 0
assert (len(getValidatedCheckpoints( userIdThatDidNotAnswer )) == objective),"Incorrect number of answers"
In [ ]:
objective = 1
assert (len(getValidatedCheckpoints( userId1AnswerEN )) == objective),"Incorrect number of answers"
In [ ]:
assert (getValidatedCheckpoints( userId1AnswerEN )[0].equals(validableCheckpoints)) \
, "User has validated everything"
In [ ]:
objective = 2
assert (len(getValidatedCheckpoints( userIdAnswersEN )) == objective),"Incorrect number of answers"
In [ ]:
# Expected number of checkpoints validated by the first answer of userIdAnswersEN.
objective = 3
# str(objective) is required: the message is only evaluated when the assertion
# fails, and "str + int" would then raise TypeError instead of AssertionError.
assert (len(getValidatedCheckpoints( userIdAnswersEN )[0]) == objective) \
, "User has validated " + str(objective) + " chapters on first try"
In [ ]:
objective = 1
assert (len(getValidatedCheckpoints( userId1AnswerFR )) == objective),"Incorrect number of answers"
In [ ]:
assert (getValidatedCheckpoints( userId1AnswerFR )[0].equals(validableCheckpoints)) \
, "User has validated everything"
In [ ]:
objective = 2
assert (len(getValidatedCheckpoints( userIdAnswersFR )) == objective),"Incorrect number of answers"
In [ ]:
# Expected number of checkpoints validated by the second answer of userIdAnswersFR.
objective = 5
# str(objective) is required: the message is only evaluated when the assertion
# fails, and "str + int" would then raise TypeError instead of AssertionError.
assert (len(getValidatedCheckpoints( userIdAnswersFR )[1]) == objective) \
, "User has validated " + str(objective) + " chapters on second try"
In [ ]:
objective = 2
assert (len(getValidatedCheckpoints( userIdAnswersENFR )) == objective),"Incorrect number of answers"
In [ ]:
# Expected number of checkpoints validated by the second answer of userIdAnswersENFR.
objective = 5
# str(objective) is required: the message is only evaluated when the assertion
# fails, and "str + int" would then raise TypeError instead of AssertionError.
assert (len(getValidatedCheckpoints( userIdAnswersENFR )[1]) == objective) \
, "User has validated " + str(objective) + " chapters on second try"
In [ ]:
getValidatedCheckpoints( userIdThatDidNotAnswer )
In [ ]:
pd.Series(getValidatedCheckpoints( userIdThatDidNotAnswer ))
In [ ]:
type(getNonValidated(pd.Series(getValidatedCheckpoints( userIdThatDidNotAnswer ))))
In [ ]:
validableCheckpoints
In [ ]:
assert(getNonValidated(getValidatedCheckpoints( userIdThatDidNotAnswer ))).equals(validableCheckpoints), \
"incorrect validated checkpoints: should contain all checkpoints that can be validated"
In [ ]:
testSeries = pd.Series(
[
'', # 7
'', # 8
'', # 9
'', # 10
'tutorial1.Checkpoint00', # 11
'tutorial1.Checkpoint00', # 12
'tutorial1.Checkpoint00', # 13
'tutorial1.Checkpoint00', # 14
'tutorial1.Checkpoint02', # 15
'tutorial1.Checkpoint01', # 16
'tutorial1.Checkpoint05'
]
)
assert(getNonValidated(pd.Series([testSeries]))[0][0] == 'tutorial1.Checkpoint13'), "Incorrect non validated checkpoint"
In [ ]:
getNonValidatedCheckpoints( userIdThatDidNotAnswer )
In [ ]:
getNonValidatedCheckpoints( userId1AnswerEN )
In [ ]:
getNonValidatedCheckpoints( userIdAnswersEN )
In [ ]:
getNonValidatedCheckpoints( userId1AnswerFR )
In [ ]:
getNonValidatedCheckpoints( userIdAnswersFR )
In [ ]:
getNonValidatedCheckpoints( userIdAnswersENFR )
In [ ]:
getValidatedCheckpointsCounts(userIdThatDidNotAnswer)
getValidatedCheckpointsCounts(userId1AnswerEN)
getValidatedCheckpointsCounts(userIdAnswersEN)
getValidatedCheckpointsCounts(userId1ScoreEN)
getValidatedCheckpointsCounts(userIdScoresEN)
getValidatedCheckpointsCounts(userId1AnswerFR)
getValidatedCheckpointsCounts(userIdAnswersFR)
getValidatedCheckpointsCounts(userId1ScoreFR)
getValidatedCheckpointsCounts(userIdScoresFR)
getValidatedCheckpointsCounts(userIdAnswersENFR)
In [ ]:
getNonValidatedCheckpointsCounts(userIdThatDidNotAnswer)
getNonValidatedCheckpointsCounts(userId1AnswerEN)
getNonValidatedCheckpointsCounts(userIdAnswersEN)
getNonValidatedCheckpointsCounts(userId1ScoreEN)
getNonValidatedCheckpointsCounts(userIdScoresEN)
getNonValidatedCheckpointsCounts(userId1AnswerFR)
getNonValidatedCheckpointsCounts(userIdAnswersFR)
getNonValidatedCheckpointsCounts(userId1ScoreFR)
getNonValidatedCheckpointsCounts(userIdScoresFR)
getNonValidatedCheckpointsCounts(userIdAnswersENFR)
In [ ]:
# Accepted spellings of generic answers, English followed by French.
aYes = ["Yes", "Oui"]
aNo = ["No", "Non"]
aNoIDK = ["No", "Non", "I don't know", "Je ne sais pas"]

# The q...Index values are passed to getAllAnswerRows / getPercentCorrectKnowingAnswer
# below (presumably positions of the question columns in gform — TODO confirm).

# How long have you studied biology?
qBiologyEducationLevelIndex = 5
aBiologyEducationLevelHigh = ["Until bachelor's degree", "Jusqu'à la license"]
# A comma was missing after 'Not even in middle school': implicit string
# concatenation merged it with "Jusqu'au bac", so neither answer could match.
aBiologyEducationLevelLow = [
    'Until the end of high school',
    'Until the end of middle school',
    'Not even in middle school',
    "Jusqu'au bac",
    "Jusqu'au brevet",
    'Jamais',
]

# Have you ever heard about BioBricks?
qHeardBioBricksIndex = 8

# Have you played the current version of Hero.Coli?
qPlayedHerocoliIndex = 10
qPlayedHerocoliYes = ['Yes', 'Once', 'Multiple times', 'Oui',
                      'De nombreuses fois', 'Quelques fois', 'Une fois']
qPlayedHerocoliNo = ['No', 'Non',]
In [ ]:
gform[QStudiedBiology].unique()
In [ ]:
gform['Before playing Hero.Coli, had you ever heard about BioBricks?'].unique()
In [ ]:
gform['Have you played the current version of Hero.Coli?'].unique()
In [ ]:
getAllAnswerRows(qBiologyEducationLevelIndex, aBiologyEducationLevelHigh)
In [ ]:
assert(len(getAllAnswerRows(qBiologyEducationLevelIndex, aBiologyEducationLevelHigh)) != 0)
In [ ]:
assert(len(getAllAnswerRows(qBiologyEducationLevelIndex, aBiologyEducationLevelLow)) != 0)
In [ ]:
assert(len(getAllAnswerRows(qHeardBioBricksIndex, aYes)) != 0)
In [ ]:
assert(len(getAllAnswerRows(qHeardBioBricksIndex, aNoIDK)) != 0)
In [ ]:
assert(len(getAllAnswerRows(qPlayedHerocoliIndex, qPlayedHerocoliYes)) != 0)
In [ ]:
assert(len(getAllAnswerRows(qPlayedHerocoliIndex, qPlayedHerocoliNo)) != 0)
In [ ]:
questionIndex = 15
gform.iloc[:, questionIndex].head()
In [ ]:
(qBiologyEducationLevelIndex, aBiologyEducationLevelHigh)
In [ ]:
getAllAnswerRows(qBiologyEducationLevelIndex, aBiologyEducationLevelHigh)
In [ ]:
getPercentCorrectKnowingAnswer(qBiologyEducationLevelIndex, aBiologyEducationLevelHigh)
In [ ]:
getPercentCorrectKnowingAnswer(qBiologyEducationLevelIndex, aBiologyEducationLevelLow)
In [ ]:
getPercentCorrectKnowingAnswer(qHeardBioBricksIndex, aYes)
In [ ]:
getPercentCorrectKnowingAnswer(qHeardBioBricksIndex, aNoIDK)
In [ ]:
playedHerocoliIndexYes = getPercentCorrectKnowingAnswer(qPlayedHerocoliIndex, qPlayedHerocoliYes)
playedHerocoliIndexYes
In [ ]:
playedHerocoliIndexNo = getPercentCorrectKnowingAnswer(qPlayedHerocoliIndex, qPlayedHerocoliNo)
playedHerocoliIndexNo
In [ ]:
playedHerocoliIndexYes - playedHerocoliIndexNo
In [ ]:
(playedHerocoliIndexYes - playedHerocoliIndexNo) / (1 - playedHerocoliIndexNo)
In [ ]:
#gform = gformEN
In [ ]:
transposed = gform.T
#answers = transposed[transposed[]]
transposed
In [ ]:
type(gform)
In [ ]:
gform.columns
In [ ]:
gform.columns.get_loc('Do not edit - pre-filled anonymous ID')
In [ ]:
localplayerguidkey
In [ ]:
# Using the whole question:
gform[localplayerguidkey]
In [ ]:
# Get index from question
localplayerguidindex
In [ ]:
# Using the index of the question:
gform.iloc[:, localplayerguidindex]
userIdThatDidNotAnswer
userId1AnswerEN
userIdAnswersEN
userId1AnswerFR
userIdAnswersFR
userIdAnswersENFR
In [ ]:
sample = gform
#def getUniqueUserCount(sample):
sample[localplayerguidkey].nunique()
In [ ]:
userIds = gform[localplayerguidkey].unique()
len(userIds)
In [ ]:
# Print the raw responder count and the distinct-id count, then report every
# distinct id that fails the GUID format check.
allResponders = getAllResponders()
uniqueUsers = np.unique(allResponders)
print(len(allResponders))
print(len(uniqueUsers))
for guid in uniqueUsers:
    if isGUIDFormat(guid):
        continue
    print('incorrect guid: ' + str(guid))
In [ ]:
# Draw a random responder id, retrying until the drawn value is GUID-formatted.
# Note: despite its name, uniqueUsers is not deduplicated here (no np.unique call).
uniqueUsers = getAllResponders()
userCount = len(uniqueUsers)
# '0' is a starting value meant to fail isGUIDFormat so the loop body runs
# at least once (assumption — isGUIDFormat is defined elsewhere).
guid = '0'
# NOTE(review): the loop never terminates if no responder id is GUID-formatted.
while (not isGUIDFormat(guid)):
    # NOTE(review): randint is not imported in this notebook; the bounds assume
    # an inclusive upper bound (random.randint semantics) — confirm its origin.
    userIndex = randint(0,userCount-1)
    guid = uniqueUsers[userIndex]
# userIndex is reused by a later cell (gform.loc[userIndex, ...]).
guid
In [ ]:
# Inlined draft of getAnswers( userId, _form = gform ): select the form rows of
# one user and transpose them so each answer sheet becomes a column named
# answersColumnNameStem + <row label>.
#userId = userIdThatDidNotAnswer
#userId = userId1AnswerEN
userId = userIdAnswersEN
_form = gform
#def getAnswers( userId, _form = gform ):
answers = _form[_form[localplayerguidkey]==userId]
_columnAnswers = answers.T
if len(answers) == 0:
    # user has never answered
    print("user " + str(userId) + " has never answered")
else:
    _newColumns = [
        answersColumnNameStem + str(column)
        for column in _columnAnswers.columns
    ]
    _columnAnswers.columns = _newColumns
_columnAnswers
In [ ]:
answers
In [ ]:
# Selection of a specific answer
answers.iloc[:,localplayerguidindex]
In [ ]:
answers.iloc[:,localplayerguidindex].iloc[0]
In [ ]:
type(answers.iloc[0,:])
In [ ]:
answers.iloc[0,:].values
In [ ]:
#### Question that has a correct answer:
In [ ]:
questionIndex = 15
In [ ]:
answers.iloc[:,questionIndex].iloc[0]
In [ ]:
correctAnswers.iloc[questionIndex][0]
In [ ]:
answers.iloc[:,questionIndex].iloc[0].startswith(correctAnswers.iloc[questionIndex][0])
In [ ]:
#### Question that has no correct answer:
In [ ]:
questionIndex = 0
#answers.iloc[:,questionIndex].iloc[0].startswith(correctAnswers.iloc[questionIndex].iloc[0])
In [ ]:
#### Batch check:
In [ ]:
columnAnswers = getAnswers( userId )
In [ ]:
columnAnswers.values[2,0]
In [ ]:
columnAnswers[columnAnswers.columns[0]][2]
In [ ]:
correctAnswers
In [ ]:
type(columnAnswers)
In [ ]:
indexOfFirstEvaluationQuestion = 13
columnAnswers.index[indexOfFirstEvaluationQuestion]
In [ ]:
gform.tail(50)
In [ ]:
gform[gform[localplayerguidkey] == 'ba202bbc-af77-42e8-85ff-e25b871717d5']
In [ ]:
gformRealBefore = gform.loc[88, QTimestamp]
gformRealBefore
In [ ]:
gformRealAfter = gform.loc[107, QTimestamp]
gformRealAfter
In [ ]:
RMRealFirstEvent = getFirstEventDate(gform.loc[88,localplayerguidkey])
RMRealFirstEvent
In [ ]:
# Inlined draft of getTemporality( answerDate, gameEventDate ), run on real data.
# The original cell assigned only tzAnswerDate but compared `answerDate`, which
# it never bound (NameError on a fresh kernel, or a stale value otherwise).
# Both names now refer to the same input.
answerDate = gformRealBefore
tzAnswerDate = answerDate
gameEventDate = RMRealFirstEvent
#def getTemporality( answerDate, gameEventDate ):
# Default temporality, kept when gameEventDate equals the Timestamp.max sentinel
# (presumably meaning "no game event found" — TODO confirm).
result = answerTemporalities[2]
if(gameEventDate != pd.Timestamp.max.tz_localize('utc')):
    if(answerDate <= gameEventDate):
        # answered before (or exactly at) the first game event
        result = answerTemporalities[0]
    elif (answerDate > gameEventDate):
        # answered after the first game event
        result = answerTemporalities[1]
result, tzAnswerDate, gameEventDate
In [ ]:
In [ ]:
firstEventDate = getFirstEventDate(gform.loc[userIndex,localplayerguidkey])
firstEventDate
In [ ]:
gformTestBefore = pd.Timestamp('2018-01-16 14:28:20.998000+0000', tz='UTC')
getTemporality(gformTestBefore,firstEventDate)
In [ ]:
gformTestWhile = pd.Timestamp('2018-01-16 14:28:23.998000+0000', tz='UTC')
getTemporality(gformTestWhile,firstEventDate)
In [ ]:
gformTestAfter = pd.Timestamp('2018-01-16 14:28:24.998000+0000', tz='UTC')
getTemporality(gformTestAfter,firstEventDate)
In [ ]:
_form = gform
_rmDF = rmdf1522
_rmTestDF = normalizedRMDFTest
includeAndroid = True
#def getTestAnswers( _form = gform, _rmDF = rmdf1522, _rmTestDF = normalizedRMDFTest, includeAndroid = True):
_form[_form[localplayerguidkey].isin(testUsers)]
In [ ]:
_form[localplayerguidkey]
In [ ]:
testUsers
In [ ]:
len(getTestAnswers()[localplayerguidkey])
In [ ]:
rmdf1522['customData.platform'].unique()
In [ ]:
rmdf1522[rmdf1522['customData.platform'].apply(lambda s: str(s).endswith('editor'))]
In [ ]:
rmdf1522[rmdf1522['userId'].isin(getTestAnswers()[localplayerguidkey])][['userTime','customData.platform','userId']].dropna()
In [ ]:
columnAnswers
In [ ]:
#testUserId = userId1AnswerEN
testUserId = '8d352896-a3f1-471c-8439-0f426df901c1'
In [ ]:
getCorrections(testUserId)
In [ ]:
# Inlined draft of getCorrections( _userId, _source = correctAnswers, _form = gform ).
# For every answers column of the user, add a matching corrections column that
# holds, per question: NaN when the question has no accepted answer, True when
# the user's answer starts with one of the accepted answers, False otherwise.
testUserId = '8d352896-a3f1-471c-8439-0f426df901c1'
source = correctAnswers
#def getCorrections( _userId, _source = correctAnswers, _form = gform ):
columnAnswers = getAnswers( testUserId )
if 0 != len(columnAnswers.columns):
    # kept because later inspection cells display questionsCount
    questionsCount = len(columnAnswers.values)
    # .columns is read once here, so columns added in the loop are not iterated
    for columnName in columnAnswers.columns:
        if answersColumnNameStem in columnName:
            answerNumber = columnName.replace(answersColumnNameStem,"")
            newCorrectionsColumnName = correctionsColumnNameStem + answerNumber
            # Explicit all-NaN column on the frame's own index. The original
            # copied the answers, then overwrote them with a RangeIndex Series
            # that became NaN only through index misalignment. object dtype
            # lets True/False be stored without a dtype upcast.
            columnAnswers[newCorrectionsColumnName] = pd.Series(
                np.nan, index=columnAnswers.index, dtype=object)
            for question in columnAnswers[columnName].index:
                #print()
                #print(question)
                __correctAnswers = source.loc[question]
                if(len(__correctAnswers) > 0):
                    columnAnswers.loc[question,newCorrectionsColumnName] = False
                    for correctAnswer in __correctAnswers:
                        #print("-> " + correctAnswer)
                        if str(columnAnswers.loc[question,columnName])\
                        .startswith(str(correctAnswer)):
                            columnAnswers.loc[question,newCorrectionsColumnName] = True
                            break
else:
    # user has never answered
    print("can't give correct answers")
columnAnswers
In [ ]:
# Pick the age question and the first column of columnAnswers whose name
# starts with 'answers' ('' when there is none).
question = QAge
columnName = next(
    (column for column in columnAnswers.columns if str.startswith(column, 'answers')),
    ''
)
In [ ]:
type(columnAnswers.loc[question,columnName])
In [ ]:
getCorrections(localplayerguid)
In [ ]:
gform.columns[20]
In [ ]:
columnAnswers.loc[gform.columns[20],columnAnswers.columns[1]]
In [ ]:
columnAnswers[columnAnswers.columns[1]][gform.columns[13]]
In [ ]:
columnAnswers.loc[gform.columns[13],columnAnswers.columns[1]]
In [ ]:
columnAnswers.iloc[20,1]
In [ ]:
questionsCount
In [ ]:
np.full(3, np.nan)
In [ ]:
pd.Series(np.full(questionsCount, np.nan))
In [ ]:
columnAnswers.loc[question,newCorrectionsColumnName]
In [ ]:
question
In [ ]:
correctAnswers[question]
In [ ]:
getCorrections('8d352896-a3f1-471c-8439-0f426df901c1')
In [ ]:
correctAnswersEN
#demographicAnswersEN
type([])
In [ ]:
# Build, question by question, the combination of the English demographic
# answers and the English correct answers, working on a copy so that
# correctAnswersEN itself is left untouched.
mergedCorrectAnswersEN = correctAnswersEN.copy()
for index in mergedCorrectAnswersEN.index:
    #print(str(mergedCorrectAnswersEN.loc[index,column]))
    # `+` presumably concatenates two per-question lists of accepted answers
    # (len() is taken of such entries elsewhere in this notebook) — TODO confirm.
    mergedCorrectAnswersEN.loc[index] =\
    demographicAnswersEN.loc[index] + mergedCorrectAnswersEN.loc[index]
mergedCorrectAnswersEN
In [ ]:
correctAnswersEN + demographicAnswersEN
In [ ]:
correctAnswers + demographicAnswers
In [ ]:
corrections = getCorrections(userIdAnswersENFR)
#corrections
In [ ]:
# Manual binarisation of the corrections columns: True becomes 1, False becomes 0,
# anything else (e.g. NaN) is left as is. Columns that are not corrections
# columns are skipped.
for columnName in corrections.columns:
    if correctionsColumnNameStem not in columnName:
        continue
    for index in corrections[columnName].index:
        _cellValue = corrections.loc[index,columnName]
        if True == _cellValue:
            corrections.loc[index,columnName] = 1
        elif False == _cellValue:
            corrections.loc[index,columnName] = 0
corrections
In [ ]:
binarized = getBinarizedCorrections(corrections)
binarized
In [ ]:
slicedBinarized = binarized[13:40]
slicedBinarized
In [ ]:
slicedBinarized =\
binarized[13:40][binarized.columns[\
binarized.columns.to_series().str.contains(correctionsColumnNameStem)\
]]
slicedBinarized
In [ ]:
_source = correctAnswers
_userId = getRandomGFormGUID()
getCorrections(_userId, _source=_source, _form = gform)
In [ ]:
_userId = '5e978fb3-316a-42ba-bb58-00856353838d'
gform[gform[localplayerguidkey] == _userId].iloc[0].index
In [ ]:
_gformLine = gform[gform[localplayerguidkey] == _userId].iloc[0]
_gformLine.loc['Before playing Hero.Coli, had you ever heard about synthetic biology?']
In [ ]:
# Inlined draft of getBinarized(_gformLine, _source = correctAnswers), for a
# single form row: 1 when the answer starts with an accepted answer, 0 when it
# does not, NaN when the question has no accepted answer. Only questions that
# have accepted answers are kept in the final slice.
_gformLine = gform[gform[localplayerguidkey] == _userId].iloc[0]
# only for one user
# def getBinarized(_gformLine, _source = correctAnswers):
_notEmptyIndexes = [
    _index for _index in _source.index if len(_source.loc[_index]) > 0
]
_binarized = pd.Series(np.nan, index = _gformLine.index)
for question in _gformLine.index:
    _correctAnswers = _source.loc[question]
    if len(_correctAnswers) == 0:
        continue
    _givenAnswer = str(_gformLine.loc[question])
    _isCorrect = any(
        _givenAnswer.startswith(str(_correctAnswer))
        for _correctAnswer in _correctAnswers
    )
    _binarized.loc[question] = 1 if _isCorrect else 0
_slicedBinarized = _binarized.loc[_notEmptyIndexes]
_slicedBinarized
In [ ]:
_slicedBinarized.loc['What are BioBricks and devices?']
In [ ]:
allBinarized = getAllBinarized()
In [ ]:
plotCorrelationMatrix(allBinarized)
In [ ]:
source
In [ ]:
# Count the questions that have no accepted answer at all in the combined
# (correct + demographic) source.
source = correctAnswers + demographicAnswers
notEmptyIndexes = [
    eltIndex for eltIndex in source.index if len(source.loc[eltIndex]) > 0
]
len(source)-len(notEmptyIndexes)
In [ ]:
emptyForm = gform[gform[localplayerguidkey] == 'incorrectGUID']
In [ ]:
emptyForm
In [ ]:
# Inlined draft of getAllBinarized(_source = correctAnswers, _form = gform):
# one row per corrected answer sheet, one column per question that has at
# least one accepted answer.
_source = correctAnswers + demographicAnswers
_form = gform #emptyForm
#def getAllBinarized(_source = correctAnswers, _form = gform ):
# questions with at least one accepted answer
_notEmptyIndexes = []
for _index in _source.index:
    if(len(_source.loc[_index]) > 0):
        _notEmptyIndexes.append(_index)
# Empty frame carrying the question index, so the result keeps that index
# even when _form has no responders.
_result = pd.DataFrame(index = _notEmptyIndexes)
# Collect the per-user slices and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration (quadratic).
_slices = [_result]
for _userId in getAllResponders( _form = _form ):
    _corrections = getCorrections(_userId, _source=_source, _form = _form)
    _binarized = getBinarizedCorrections(_corrections)
    # keep only the corrections columns, restricted to questions with accepted answers
    _slicedBinarized =\
    _binarized.loc[_notEmptyIndexes][_binarized.columns[\
    _binarized.columns.to_series().str.contains(correctionsColumnNameStem)\
    ]]
    _slices.append(_slicedBinarized)
_result = pd.concat(_slices, axis=1)
_result = _result.T
#_result
In [ ]:
if(_result.shape[0] > 0 and _result.shape[1] > 0):
correlation = _result.astype(float).corr()
#plt.matshow(correlation)
sns.clustermap(correlation,cmap=plt.cm.jet,square=True,figsize=(10,10))
In [ ]:
#ax = sns.clustermap(correlation,cmap=plt.cm.jet,square=True,figsize=(10,10),cbar_kws={\
#"orientation":"vertical"})
In [ ]:
# Compare the correlation matrices obtained with the three entries of `methods`
# (named pearson / kendall / spearman by the variables below; `methods` itself
# is defined elsewhere).
# NOTE(review): _result was already transposed at the end of the previous draft
# cell, and the clustermap cell calls _result.corr() directly; with the extra
# .T here the correlation is computed between the other axis' entries — confirm
# this is intended.
correlation_pearson = _result.T.astype(float).corr(methods[0])
correlation_kendall = _result.T.astype(float).corr(methods[1])
correlation_spearman = _result.T.astype(float).corr(methods[2])
# strict equality checks between the matrices
print(correlation_pearson.equals(correlation_kendall))
print(correlation_kendall.equals(correlation_spearman))
# pearson-minus-kendall differences above 0.1, with the masked (NaN) entries removed
diff = (correlation_pearson - correlation_kendall)
flattened = diff[diff > 0.1].values.flatten()
flattened[~np.isnan(flattened)]
In [ ]:
correlation
In [ ]:
# Axis labels for the 27 scientific questions, numbered #1..#27.
# The original cell first assigned gform.columns[13:40] and immediately
# overwrote it; that dead store is dropped (the literal list presumably mirrors
# those columns with a ' #n' suffix — TODO confirm).
# Y-axis labels: question text followed by its number.
scientificQuestionsLabels = [
    'In order to modify the abilities of the bacterium, you have to... #1',
    'What are BioBricks and devices? #2',
    'What is the name of this BioBrick? #3',
    'What is the name of this BioBrick?.1 #4',
    'What is the name of this BioBrick?.2 #5',
    'What is the name of this BioBrick?.3 #6',
    'What does this BioBrick do? #7',
    'What does this BioBrick do?.1 #8',
    'What does this BioBrick do?.2 #9',
    'What does this BioBrick do?.3 #10',
    'Pick the case where the BioBricks are well-ordered: #11',
    'When does green fluorescence happen? #12',
    'What happens when you unequip the movement device? #13',
    'What is this? #14',
    'What does this device do? #15',
    'What does this device do?.1 #16',
    'What does this device do?.2 #17',
    'What does this device do?.3 #18',
    'What does this device do?.4 #19',
    'What does this device do?.5 #20',
    'What does this device do?.6 #21',
    'What does this device do?.7 #22',
    'Guess: what would a device producing l-arabinose do, if it started with a l-arabinose-induced promoter? #23',
    'Guess: the bacterium would glow yellow... #24',
    'What is the species of the bacterium of the game? #25',
    'What is the scientific name of the tails of the bacterium? #26',
    'Find the antibiotic: #27',
]
# X-axis labels: number first, so it stays readable on vertically rotated ticks.
# Label #23 was garbled ('...induced p#23 romoter?'); the number is moved to the front.
scientificQuestionsLabelsX = [
    '#1 In order to modify the abilities of the bacterium, you have to...',
    '#2 What are BioBricks and devices?',
    '#3 What is the name of this BioBrick?',
    '#4 What is the name of this BioBrick?.1',
    '#5 What is the name of this BioBrick?.2',
    '#6 What is the name of this BioBrick?.3',
    '#7 What does this BioBrick do?',
    '#8 What does this BioBrick do?.1',
    '#9 What does this BioBrick do?.2',
    '#10 What does this BioBrick do?.3',
    '#11 Pick the case where the BioBricks are well-ordered:',
    '#12 When does green fluorescence happen?',
    '#13 What happens when you unequip the movement device?',
    '#14 What is this?',
    '#15 What does this device do?',
    '#16 What does this device do?.1',
    '#17 What does this device do?.2',
    '#18 What does this device do?.3',
    '#19 What does this device do?.4',
    '#20 What does this device do?.5',
    '#21 What does this device do?.6',
    '#22 What does this device do?.7',
    '#23 Guess: what would a device producing l-arabinose do, if it started with a l-arabinose-induced promoter?',
    '#24 Guess: the bacterium would glow yellow...',
    '#25 What is the species of the bacterium of the game?',
    '#26 What is the scientific name of the tails of the bacterium?',
    '#27 Find the antibiotic:',
]
In [ ]:
# Correlation matrix drawn with matshow, labelled with the numbered questions.
questionsLabels = scientificQuestionsLabels
questionsLabelsX = scientificQuestionsLabelsX
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.matshow(correlation)
# Fix the tick positions before setting the labels: labels assigned while the
# axis still uses its automatic locator may end up at unexpected positions.
# Ticks start at -1, hence the leading empty label.
ax.set_xticks(np.arange(-1,len(questionsLabels),1.))
ax.set_yticks(np.arange(-1,len(questionsLabels),1.))
ax.set_xticklabels(['']+questionsLabelsX, rotation='vertical')
ax.set_yticklabels(['']+questionsLabels);
In [ ]:
questionsLabels = correlation.columns.copy()
newLabels = []
for index in range(0, len(questionsLabels)):
newLabels.append(questionsLabels[index] + ' #' + str(index + 1))
correlationRenamed = correlation.copy()
correlationRenamed.columns = newLabels
correlationRenamed.index = newLabels
correlationRenamed
In [ ]:
correlationRenamed = correlation.copy()
correlationRenamed.columns = pd.Series(correlation.columns).apply(lambda x: x + ' #' + str(correlation.columns.get_loc(x) + 1))
correlationRenamed.index = correlationRenamed.columns
correlationRenamed
In [ ]:
correlation.shape
In [ ]:
fig = plt.figure(figsize=(10,10))
ax12 = plt.subplot(111)
ax12.set_title('Heatmap')
sns.heatmap(correlation,ax=ax12,cmap=plt.cm.jet,square=True)
In [ ]:
ax = sns.clustermap(correlation,cmap=plt.cm.jet,square=True,figsize=(10,10),cbar_kws={\
"orientation":"vertical"})
In [ ]:
# Correlation matrix drawn with imshow; each label is the question text
# followed by its 1-based position among the correlation columns.
questionsLabels = pd.Series(correlation.columns).apply(lambda x: x + ' #' + str(correlation.columns.get_loc(x) + 1))
fig = plt.figure(figsize=(10,10))
ax = plt.subplot(111)
cmap=plt.cm.jet
#cmap=plt.cm.ocean
cax = ax.imshow(correlation, interpolation='nearest', cmap=cmap,
    # extent=(0.5,np.shape(correlation)[0]+0.5,0.5,np.shape(correlation)[1]+0.5)
    )
#ax.grid(True)
plt.title('Questions\' Correlations')
# Fix the tick positions before setting the labels: labels assigned while the
# axis still uses its automatic locator may end up at unexpected positions.
ax.set_xticks(np.arange(len(questionsLabels)))
ax.set_yticks(np.arange(len(questionsLabels)))
ax.set_xticklabels(questionsLabels, rotation='vertical')
ax.set_yticklabels(questionsLabels)
#ax.set_xticks(np.arange(-1,len(questionsLabels),1.));
#ax.set_yticks(np.arange(-1,len(questionsLabels),1.));
fig.colorbar(cax)
plt.show()
In [ ]:
ax.get_xticks()
In [ ]:
transposed = _result.T.astype(float)
transposed.head()
In [ ]:
transposed.corr()
In [ ]:
transposed.columns = range(0,len(transposed.columns))
transposed.index = range(0,len(transposed.index))
transposed.head()
In [ ]:
transposed = transposed.iloc[0:10,0:3]
transposed
In [ ]:
transposed = transposed.astype(float)
In [ ]:
type(transposed[0][0])
In [ ]:
transposed.columns = list('ABC')
transposed
In [ ]:
transposed.loc[0, 'A'] = 0
transposed
In [ ]:
transposed.corr()
data = transposed[[0,1]] data.corr(method = 'spearman')
In [ ]:
round(7.64684)
In [ ]:
# Toy 20x2 frame whose cells are 0 or 10, used to try out DataFrame.corr().
# A dedicated seeded generator keeps the cell reproducible under
# "Restart & Run All" without touching numpy's global random state.
df = pd.DataFrame(10*np.random.RandomState(0).randint(2, size=[20,2]),index=range(0,20),columns=list('AB'))
#df.columns = range(0,len(df.columns))
df.head()
#type(df[0][0])
In [ ]:
type(df.columns)
In [ ]:
df.corr()
In [ ]:
#corr = pd.Series({}, index = methods)
for meth in methods:
#corr[meth] = result.corr(method = meth)
print(meth + ":\n" + str(transposed.corr(method = meth)) + "\n\n")
In [ ]:
In [ ]:
# Restrict the form to the answers whose temporality is answerTemporalities[0]
# ("befores"), print how many there are, then binarize them against the
# combined correct + demographic answers.
befores = gform.copy()
befores = befores.loc[befores[QTemporality] == answerTemporalities[0]]
print(len(befores))
allBeforesBinarized = getAllBinarized(
    _source = correctAnswers + demographicAnswers,
    _form = befores,
)
In [ ]:
np.unique(allBeforesBinarized.values.flatten())
In [ ]:
allBeforesBinarized.columns[20]
In [ ]:
allBeforesBinarized.T.dot(allBeforesBinarized)
In [ ]:
np.unique(allBeforesBinarized.iloc[:,20].values)
In [ ]:
plotCorrelationMatrix( allBeforesBinarized, _abs=False,\
_clustered=False, _questionNumbers=True )
In [ ]:
_correlation = allBeforesBinarized.astype(float).corr()
overlay = allBeforesBinarized.T.dot(allBeforesBinarized).astype(int)
_correlation.columns = pd.Series(_correlation.columns).apply(\
lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
_correlation.index = _correlation.columns
_correlation = _correlation.abs()
_fig = plt.figure(figsize=(20,20))
_ax = plt.subplot(111)
#sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=overlay,fmt='d')
sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=True)
In [ ]:
# Restrict the form to the answers whose temporality is answerTemporalities[1]
# ("afters"), print how many there are, then binarize them against the
# combined correct + demographic answers.
afters = gform.copy()
afters = afters.loc[afters[QTemporality] == answerTemporalities[1]]
print(len(afters))
allAftersBinarized = getAllBinarized(
    _source = correctAnswers + demographicAnswers,
    _form = afters,
)
In [ ]:
np.unique(allAftersBinarized.values.flatten())
In [ ]:
plotCorrelationMatrix( allAftersBinarized, _abs=False,\
_clustered=False, _questionNumbers=True )
In [ ]:
#for answerIndex in range(0,len(allAftersBinarized)):
# print(str(answerIndex) + " " + str(allAftersBinarized.iloc[answerIndex,0]))
In [ ]:
allAftersBinarized.iloc[28,0]
In [ ]:
len(allAftersBinarized)
In [ ]:
len(allAftersBinarized.index)
In [ ]:
_correlation = allAftersBinarized.astype(float).corr()
overlay = allAftersBinarized.T.dot(allAftersBinarized).astype(int)
_correlation.columns = pd.Series(_correlation.columns).apply(\
lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
_correlation.index = _correlation.columns
_fig = plt.figure(figsize=(10,10))
_ax = plt.subplot(111)
#sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True,annot=overlay,fmt='d')
sns.heatmap(_correlation,ax=_ax,cmap=plt.cm.jet,square=True)
In [ ]:
crossCorrect = getCrossCorrectAnswers(allAftersBinarized)
In [ ]:
pd.Series((overlay == crossCorrect).values.flatten()).unique()
In [ ]:
allAftersBinarized.shape
In [ ]:
cross = allAftersBinarized.T.dot(allAftersBinarized)
cross.shape
In [ ]:
equal = (cross == crossCorrect)
type(equal)
In [ ]:
pd.Series(equal.values.flatten()).unique()
In [ ]:
testUser = userIdAnswersFR
In [ ]:
gform[gform[localplayerguidkey] == testUser].T
In [ ]:
getScore(testUser)
In [ ]:
print("draft test")
testUserId = "3ef14300-4987-4b54-a56c-5b6d1f8a24a1"
testUserId = userIdAnswersEN
#def getScore( _userId, _form = gform ):
score = pd.DataFrame({}, columns = answerTemporalities)
score.loc['score',:] = np.nan
for column in score.columns:
score.loc['score', column] = []
if hasAnswered( testUserId ):
columnAnswers = getCorrections(testUserId)
for columnName in columnAnswers.columns:
# only work on corrected columns
if correctionsColumnNameStem in columnName:
answerColumnName = columnName.replace(correctionsColumnNameStem,\
answersColumnNameStem)
temporality = columnAnswers.loc[QTemporality,answerColumnName]
counts = (columnAnswers[columnName]).value_counts()
thisScore = 0
if(True in counts):
thisScore = counts[True]
score.loc['score',temporality].append(thisScore)
else:
print("user " + str(testUserId) + " has never answered")
#expectedScore = 18
#if (expectedScore != score[0]):
# print("ERROR incorrect score: expected "+ str(expectedScore) +", got "+ str(score))
score
In [ ]:
score = pd.DataFrame({}, columns = answerTemporalities)
score.loc['score',:] = np.nan
for column in score.columns:
score.loc['score', column] = []
score
In [ ]:
#score.loc['user0',:] = [1,2,3]
In [ ]:
#score
In [ ]:
#type(score)
In [ ]:
#type(score[0])
In [ ]:
#for i,v in score[0].iteritems():
# print(v)
In [ ]:
#score[0][answerTemporalities[2]]
In [ ]:
#columnAnswers.loc[QTemporality,'answers0']
In [ ]:
False in (columnAnswers[columnName]).value_counts()
In [ ]:
getScore("3ef14300-4987-4b54-a56c-5b6d1f8a24a1")
In [ ]:
#gform[gform[localplayerguidkey]=="3ef14300-4987-4b54-a56c-5b6d1f8a24a1"].T
In [ ]:
correctAnswers
Theoretically, the checkpoints validated through the questionnaire and those validated in the game should match: whoever understood an item should beat the matching challenge. Any discrepancies are due to game design or level design.
In [ ]:
#questionnaireValidatedCheckpointsPerQuestion = pd.Series(np.nan, index=range(35))
questionnaireValidatedCheckpointsPerQuestion = pd.Series(np.nan, index=range(len(checkpointQuestionMatching)))
questionnaireValidatedCheckpointsPerQuestion.head()
In [ ]:
checkpointQuestionMatching['checkpoint'][19]
In [ ]:
# Inlined draft of getValidatedCheckpoints( userId, _form = gform ): for each
# corrected answer sheet of the user, the sorted, de-duplicated checkpoints
# matched to the questions answered correctly. `result` is a Series holding one
# Series of checkpoints per answer sheet (empty when the user never answered).
userId = localplayerguid
_form = gform
#function that returns the list of checkpoints from user id
#def getValidatedCheckpoints( userId, _form = gform ):
_validatedCheckpoints = []
if hasAnswered( userId, _form = _form ):
    _columnAnswers = getCorrections( userId, _form = _form)
    for _columnName in _columnAnswers.columns:
        # only work on corrected columns
        if correctionsColumnNameStem in _columnName:
            _correctedColumn = _columnAnswers[_columnName]
            # Build the per-question series directly as objects instead of
            # writing strings into a float NaN Series, and read the corrections
            # by position with .iloc: integer keys on a label-indexed Series
            # only worked through a deprecated positional fallback.
            _questionnaireValidatedCheckpointsPerQuestion = pd.Series(
                [
                    checkpointQuestionMatching['checkpoint'][_index]
                    if _correctedColumn.iloc[_index]==True
                    else ''
                    for _index in range(0, len(checkpointQuestionMatching))
                ],
                dtype=object,
            )
            # distinct checkpoints, without the '' placeholder of non-validated questions
            _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
            _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints!='']
            _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints)
            _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints.sort_values()
            # renumber from 0 after sorting
            _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))
            _validatedCheckpoints.append(_questionnaireValidatedCheckpoints)
else:
    print("user " + str(userId) + " has never answered")
result = pd.Series(data=_validatedCheckpoints)
In [ ]:
result
In [ ]:
type(result[0])
In [ ]:
# Small fixture for trying out np.setdiff1d on checkpoint names:
# testSeries2 is a strict subset of testSeries1.
testSeries1 = pd.Series([
    'tutorial1.Checkpoint00',
    'tutorial1.Checkpoint01',
    'tutorial1.Checkpoint02',
    'tutorial1.Checkpoint05',
])
testSeries2 = pd.Series(['tutorial1.Checkpoint01', 'tutorial1.Checkpoint05'])

# Set difference computed on the Series themselves, then on their underlying
# arrays; only the last expression is displayed as the cell output.
np.setdiff1d(testSeries1, testSeries2)
np.setdiff1d(testSeries1.values, testSeries2.values)
In [ ]:
getAnswers(localplayerguid).head(2)
In [ ]:
getCorrections(localplayerguid).head(2)
In [ ]:
getScore(localplayerguid)
In [ ]:
getValidatedCheckpoints(localplayerguid)
In [ ]:
getNonValidatedCheckpoints(localplayerguid)
In [ ]:
# Column position of the "have you played Hero.Coli" question, and every answer
# (English and French) that counts as "yes".
qPlayedHerocoliIndex = 10
qPlayedHerocoliYes = [
    'Yes', 'Once', 'Multiple times',
    'Oui', 'De nombreuses fois', 'Quelques fois', 'Une fois',
]

# Arguments of the inlined function below.
questionIndex, choice, _form = qPlayedHerocoliIndex, qPlayedHerocoliYes, gform

# returns all rows of Google form's answers that contain an element
# of the array 'choice' for question number 'questionIndex'
#def getAllAnswerRows(questionIndex, choice, _form = gform ):
_form.loc[_form.iloc[:, questionIndex].isin(choice)]
In [ ]:
# Inline version of getPercentCorrectPerColumn: for each evaluation-question
# column of _df, the share of rows whose answer starts with the expected
# correct answer. Columns outside the evaluation range stay NaN; a final
# 'Count' entry stores the number of rows considered.
_df = getAllAnswerRows(qPlayedHerocoliIndex, qPlayedHerocoliYes, _form = gform )
#def getPercentCorrectPerColumn(_df):
_count = len(_df)
_percents = pd.Series(np.full(len(_df.columns), np.nan), index=_df.columns)

for _columnIndex, _columnName in enumerate(_df.columns):
    # Only evaluation questions are scored: from firstEvaluationQuestionIndex
    # up to, but excluding, the last three columns.
    if not (firstEvaluationQuestionIndex <= _columnIndex < len(_df.columns)-3):
        continue
    for _rowIndex in _df.index:
        # First row seen for this column: start its tally at zero.
        if np.isnan(_percents[_columnName]):
            _percents[_columnName] = 0
        if str(_df[_columnName][_rowIndex]).startswith(str(correctAnswers[_columnIndex])):
            _percents[_columnName] = _percents[_columnName]+1

# Turn tallies into ratios, then append the row count.
_percents = _percents/_count
_percents['Count'] = _count
print('\n\n\npercents=\n' + str(_percents))
In [ ]:
# Inline version of the commented-out getPercentCorrectKnowingAnswer below:
# select the rows whose answer to question `questionIndex` is in `choice`,
# then hand them to getPercentCorrectPerColumn.
questionIndex = qPlayedHerocoliIndex
choice = qPlayedHerocoliYes
_form = gform
#def getPercentCorrectKnowingAnswer(questionIndex, choice, _form = gform):
_answerRows = getAllAnswerRows(questionIndex, choice, _form = _form);
# Last expression of the cell: its value is displayed as the output.
getPercentCorrectPerColumn(_answerRows)
In [ ]:
#localplayerguid = '8d352896-a3f1-471c-8439-0f426df901c1'
#localplayerguid = '7037c5b2-c286-498e-9784-9a061c778609'
#localplayerguid = '5c4939b5-425b-4d19-b5d2-0384a515539e'
#localplayerguid = '7825d421-d668-4481-898a-46b51efe40f0'
#localplayerguid = 'acb9c989-b4a6-4c4d-81cc-6b5783ec71d8'

# Dump, for every responder, the first two rows of answers and corrections,
# the scores, and the validated / non-validated checkpoint counts.
# The loop variable is named responderId rather than `id`: in a notebook the
# loop variable lives in the global namespace and would shadow the builtin id().
for responderId in getAllResponders():
    print("===========================================")
    print("id=" + str(responderId))
    print("-------------------------------------------")
    print(getAnswers(responderId).head(2))
    print("-------------------------------------------")
    print(getCorrections(responderId).head(2))
    print("-------------------------------------------")
    print("scores=" + str(getScore(responderId)))
    print("#ValidatedCheckpoints=" + str(getValidatedCheckpointsCounts(responderId)))
    print("#NonValidatedCheckpoints=" + str(getNonValidatedCheckpointsCounts(responderId)))
    print("===========================================")
In [ ]:
gform[localplayerguidkey]
hasAnswered( '8d352896-a3f1-471c-8439-0f426df901c1' )
'8d352896-a3f1-471c-8439-0f426df901c1' in gform[localplayerguidkey].values
In [ ]:
# String literal containing an apostrophe; the bare expression below displays it.
apostropheTestString = "it's a test"
apostropheTestString
In [ ]:
In [ ]:
#gformEN.head(2)
In [ ]:
#gformFR.head(2)
In [ ]:
#gformEN[QLanguage] = pd.Series(enLanguageID, index=gformEN.index)
#gformFR[QLanguage] = pd.Series(frLanguageID, index=gformFR.index)
In [ ]:
#gformFR.head(2)
In [ ]:
# rename columns
#gformFR.columns = gformEN.columns
#gformFR.head(2)
In [ ]:
#gformTestMerge = pd.concat([gformEN, gformFR])
In [ ]:
#gformTestMerge.head(2)
In [ ]:
#gformTestMerge.tail(2)
In [ ]:
gform
In [ ]:
localplayerguid
In [ ]:
# Fetch the answers of one hard-coded user; the bare expression displays them.
someAnswers = getAnswers( '8ca16c7a-70a6-4723-bd72-65b8485a2e86' )
someAnswers
In [ ]:
# Index used below to pick the same entry out of the user's answers and out of correctAnswers.
testQuestionIndex = 24
In [ ]:
# This user's answer at testQuestionIndex, read from the first column of someAnswers and cast to str.
thisUsersFirstEvaluationQuestion = str(someAnswers[someAnswers.columns[0]][testQuestionIndex])
thisUsersFirstEvaluationQuestion
In [ ]:
# Entry labelled QLanguage in the same first column.
someAnswers[someAnswers.columns[0]][QLanguage]
In [ ]:
# Expected answer at the same index, cast to str so it can be compared with startswith below.
firstEvaluationQuestionCorrectAnswer = str(correctAnswers[testQuestionIndex])
firstEvaluationQuestionCorrectAnswer
In [ ]:
# Prefix comparison between the user's answer and the expected answer.
thisUsersFirstEvaluationQuestion.startswith(firstEvaluationQuestionCorrectAnswer)
In [ ]:
# Timestamp of the first questionnaire answer of one hard-coded user.
# .iloc[0] picks the first matching row by position; a plain [0] is a label
# lookup on the filtered Series and only works if that row happens to carry
# index label 0.
answerDate = gform[gform['userId'] == '51f1ef77-ec48-4976-be1f-89b7cbd1afab'][QTimestamp].iloc[0]
answerDate
In [ ]:
# Share of this user's game events recorded before and after the questionnaire
# answer date (answerDate is set in the previous cell).
allEvents = rmdf1522[rmdf1522['userId']=='51f1ef77-ec48-4976-be1f-89b7cbd1afab']
allEventsCount = len(allEvents)
# "before" means userTime earlier than the answer; the comparisons were
# previously swapped, so the two ratios came out exchanged.
eventsBeforeRatio = len(allEvents[allEvents['userTime'] < answerDate])/allEventsCount
eventsAfterRatio = len(allEvents[allEvents['userTime'] > answerDate])/allEventsCount
result = [eventsBeforeRatio, eventsAfterRatio]
result
In [ ]:
# Exploration of answer counts per temporality, and collection into `result`
# of all answers of users who answered at least twice with at least one answer
# of temporality answerTemporalities[2].
# Only the very last expression of the cell is displayed; the other bare
# expressions are kept as scratch checks.
len(gform)
len(gform[gform[QTemporality] == answerTemporalities[2]])
len(gform[gform[QTemporality] == answerTemporalities[0]])
len(gform[gform[QTemporality] == answerTemporalities[1]])
gform.loc[:, [QPlayed, 'userId', QTemporality, QTimestamp]].sort_values(by = ['userId', QTimestamp])

sortedGFs = gform.loc[:, [QPlayed, 'userId', QTemporality, QTimestamp]].sort_values(by = ['userId', QTimestamp])
sortedGFs[sortedGFs[QTemporality] == answerTemporalities[2]]

# Visit each user once. Iterating over the raw 'userId' column visited a user
# once per answer row, which appended that user's rows to `result` several times.
uniqueUserIds = sortedGFs['userId'].unique()

maxuserIdIndex = len(uniqueUserIds)
userIdIndex = 0
userIdIntProgress = IntProgress(
    value=0,
    min=0,
    max=maxuserIdIndex,
    description='userIdIndex:'
)
display(userIdIntProgress)
userIdText = Text('')
display(userIdText)

# Collect the per-user frames and concatenate once at the end instead of
# growing a DataFrame inside the loop.
matchingAnswers = []
for userid in uniqueUserIds:
    userIdIndex += 1
    userIdIntProgress.value = userIdIndex
    userIdText.value = userid

    thisUsersAnswers = sortedGFs[sortedGFs['userId'] == userid]
    if (len(thisUsersAnswers) >= 2) and (answerTemporalities[2] in thisUsersAnswers[QTemporality].values):
        matchingAnswers.append(thisUsersAnswers)
        #print(thisUsersAnswers)

# Same fallback as before when no user qualifies: an empty DataFrame.
result = pd.concat(matchingAnswers) if matchingAnswers else pd.DataFrame()
result

len(gform) - len(result)
len(gform[gform[QTemporality] == answerTemporalities[2]])
len(gform[gform[QTemporality] == answerTemporalities[0]])
len(gform[gform[QTemporality] == answerTemporalities[1]])
gform.loc[:, [QPlayed, 'userId', QTemporality, QTimestamp]].sort_values(by = ['userId', QTimestamp])
In [ ]:
rmdf1522['userTime'].min(),gform[QTimestamp].min(),rmdf1522['userTime'].min().floor('d') == gform[QTimestamp].min().floor('d')
In [ ]:
# code to find special userIds
# Prints ready-to-paste assignments (userIdThatDidNotAnswer, userId1AnswerEN,
# userIdAnswersEN, ...) naming the first userId found for each test profile:
# never answered, answered once / several times in English only, in French
# only, or in both languages.
enSpeakers = gform[gform[QLanguage]==enLanguageID]
frSpeakers = gform[gform[QLanguage]==frLanguageID]
sortedGFs = gform.loc[:, ['userId', QTemporality, QTimestamp, QLanguage]].sort_values(by = ['userId', QTimestamp])

foundUserIDThatDidNotAnswer = False
foundUserID1AnswerEN = False
foundUserIDAnswersEN = False
foundUserID1ScoreEN = False
foundUserIDScoresEN = False
foundUserID1AnswerFR = False
foundUserIDAnswersFR = False
foundUserID1ScoreFR = False
foundUserIDScoresFR = False
foundUserIDAnswersENFR = False

# Each user is examined once, so the progress bar counts distinct userIds.
uniqueUserIds = sortedGFs['userId'].unique()
maxuserIdIndex = len(uniqueUserIds)
userIdIndex = 0
userIdIntProgress = IntProgress(
    value=0,
    min=0,
    max=maxuserIdIndex,
    description='userIdIndex:'
)
display(userIdIntProgress)
userIdText = Text('')
display(userIdText)

# survey1522startDate = Timestamp('2018-03-24 12:00:00.000000+0000', tz='UTC')
survey1522startDate = gform[QTimestamp].min().floor('d')
if (rmdf1522['userTime'].min().floor('d') != gform[QTimestamp].min().floor('d')):
    print("rmdf and gform first date don't match")

# First user with game events since the survey start date but no questionnaire answer.
for userId in rmdf1522[rmdf1522['userTime'] >= survey1522startDate]['userId']:
    if userId not in sortedGFs['userId'].values:
        print("userIdThatDidNotAnswer = '" + userId + "'")
        foundUserIDThatDidNotAnswer = True
        break

for userId in uniqueUserIds:
    userIdIndex += 1
    userIdIntProgress.value = userIdIndex
    userIdText.value = userId

    answers = sortedGFs[sortedGFs['userId'] == userId]

    # Compare the set of languages used. Comparing the unique() array with a
    # one-element list yields a multi-element boolean array for bilingual
    # users, and evaluating that in an `if` raises "truth value is ambiguous".
    answerLanguages = set(answers[QLanguage])
    onlyEN = (answerLanguages == {enLanguageID})
    onlyFR = (answerLanguages == {frLanguageID})
    bothENFR = (enLanguageID in answerLanguages) and (frLanguageID in answerLanguages)

    if not foundUserID1AnswerEN and (len(answers) == 1) and onlyEN:
        print("userId1AnswerEN = '" + userId + "'")
        print("userId1ScoreEN = '" + userId + "'")
        foundUserID1AnswerEN = True
        foundUserID1ScoreEN = True

    if not foundUserIDAnswersEN and (len(answers) >= 2) and onlyEN:
        print("userIdAnswersEN = '" + userId + "'")
        print("userIdScoresEN = '" + userId + "'")
        foundUserIDAnswersEN = True
        foundUserIDScoresEN = True

    # if not foundUserID1ScoreEN and :
    #     print("userId1ScoreEN = '" + userId + "'")
    #     foundUserID1ScoreEN = True
    # if not foundUserIDScoresEN and :
    #     print("userIdScoresEN = '" + userId + "'")
    #     foundUserIDScoresEN = True

    if not foundUserID1AnswerFR and (len(answers) == 1) and onlyFR:
        print("userId1AnswerFR = '" + userId + "'")
        print("userId1ScoreFR = '" + userId + "'")
        foundUserID1AnswerFR = True
        foundUserID1ScoreFR = True

    if not foundUserIDAnswersFR and (len(answers) >= 2) and onlyFR:
        print("userIdAnswersFR = '" + userId + "'")
        print("userIdScoresFR = '" + userId + "'")
        foundUserIDAnswersFR = True
        foundUserIDScoresFR = True

    # if not foundUserID1ScoreFR and :
    #     print("userId1ScoreFR = '" + userId + "'")
    #     foundUserID1ScoreFR = True
    # if not foundUserIDScoresFR and :
    #     print("userIdScoresFR = '" + userId + "'")
    #     foundUserIDScoresFR = True

    if not foundUserIDAnswersENFR and (len(answers) >= 2) and bothENFR:
        print("userIdAnswersENFR = '" + userId + "'")
        foundUserIDAnswersENFR = True
In [ ]:
answers
In [ ]:
# Manual cross-check of getEventCountRatios for one hard-coded user: the ratios
# are recomputed by hand, then obtained again through the function.
# .iloc[0] is used for the answer date (first matching row by position), as is
# already done below for QTemporality; a plain [0] is a label lookup that only
# works when the matching row carries index label 0.
answerDate = gform[gform['userId'] == '51f1ef77-ec48-4976-be1f-89b7cbd1afab'][QTimestamp].iloc[0]
answerDate
getEventCountRatios(answerDate, '51f1ef77-ec48-4976-be1f-89b7cbd1afab')

# Hand-computed ratios of events before / after the answer date.
allEvents = rmdf1522[rmdf1522['userId']=='51f1ef77-ec48-4976-be1f-89b7cbd1afab']
allEventsCount = len(allEvents)
eventsBeforeRatio = len(allEvents[allEvents['userTime'] < answerDate])/allEventsCount
eventsAfterRatio = len(allEvents[allEvents['userTime'] > answerDate])/allEventsCount
result = [eventsBeforeRatio, eventsAfterRatio]
result

# Answer date next to the first and last event rows of this user.
[answerDate, allEvents.loc[:, ['userTime']].iloc[0], allEvents.loc[:, ['userTime']].iloc[-1]]
gform[gform['userId'] == '51f1ef77-ec48-4976-be1f-89b7cbd1afab'][QTemporality].iloc[0]

# Same ratios through the function; this last expression is the cell output.
userId = '51f1ef77-ec48-4976-be1f-89b7cbd1afab'
answerDate = gform[gform['userId'] == userId][QTimestamp].iloc[0]
[eventsBeforeRatio, eventsAfterRatio] = getEventCountRatios(answerDate, userId)
[eventsBeforeRatio, eventsAfterRatio]
In [ ]:
In [ ]:
In [ ]:
# code to find currently-sorted-as-posttest answers that have nan answers to content questions
# Prints the index label of every such answer, using QQ as the probed content question.
QQ = QBioBricksDevicesComposition
_isPosttest = gform[QTemporality] == answerTemporalities[1]
_isUnanswered = gform[QQ].isnull()
for answerIndex in gform.index[_isPosttest & _isUnanswered]:
    print(answerIndex)
In [ ]:
# code to find which answers have both already played but also filled in profile questions
# Collects the index labels of answers of temporality answerTemporalities[1]
# whose QAge entry is filled in, then shows their QPlayed answers.
answersPlayedButProfile = []
for answerIndex in gform.index:
    if gform.loc[answerIndex, QTemporality] == answerTemporalities[1]:
        # `not` instead of `~`: pd.isnull on a scalar returns a plain bool, and
        # ~True / ~False are -2 / -1, both truthy. `.loc` instead of `.iloc`:
        # answerIndex comes from gform.index and QAge is used as a column label
        # elsewhere, so both are labels, not positions.
        if not pd.isnull(gform.loc[answerIndex, QAge]):
            answersPlayedButProfile.append(answerIndex)
gform.loc[answersPlayedButProfile, QPlayed]
In [ ]:
# Take the userId of the answer with index label 54, gather all answers of that user,
# then read QAge from the first of their answers of temporality answerTemporalities[0].
userId = gform.loc[54, 'userId']
thisUserIdsAnswers = gform[gform['userId'] == userId]
# NOTE(review): .values[0] raises IndexError if this user has no answer of that temporality.
thisUserIdsAnswers[thisUserIdsAnswers[QTemporality] == answerTemporalities[0]][QAge].values[0]
In [ ]:
gform[gform[QTemporality] == answerTemporalities[0]][QAge].unique()
In [ ]:
# pretest ages
# Distinct QAge values among answers of temporality answerTemporalities[0].
ages = gform[(gform[QTemporality] == answerTemporalities[0])][QAge].unique()
# In-place sort of the array returned by unique().
ages.sort()
ages
In [ ]:
# the answers that are a problem for the analysis
# QPlayed answer text singled out as unclassifiable.
AUnclassifiable = 'I played recently on an other computer'
#_gformDF[(_gformDF[QTemporality] == answerTemporalities[1]) & (_gformDF[QAge].apply(type) == str)]
# Display every answer row whose QPlayed value is exactly that text.
gform[gform[QPlayed] == AUnclassifiable]
In [ ]:
# various tests around setPosttestsProfileInfo
# NOTE(review): _gformDF is not assigned in this cell; a later cell assigns `_gformDF = gform`,
# so this cell appears to rely on that cell having been run first — confirm.
# Only the last expression is displayed; the other bare expressions are scratch checks.
# Share of rows whose QAge is null.
len(_gformDF[pd.isnull(_gformDF[QAge])])/len(_gformDF)
# Temporalities found among rows with null QAge.
_gformDF[pd.isnull(_gformDF[QAge])][QTemporality].unique()
# QAge values found among answers of temporality answerTemporalities[1].
_gformDF[_gformDF[QTemporality] == answerTemporalities[1]][QAge].unique()
# userIds having at least one null-age row, then every row of those users.
nullAge = _gformDF[pd.isnull(_gformDF[QAge])]['userId']
nullAge = _gformDF[_gformDF['userId'].isin(nullAge)]
len(nullAge)
nullAge.sort_values(QPlayed)
# Distinct calendar dates of those rows, sorted.
dates = np.unique(nullAge[QTimestamp].apply(pd.Timestamp.date).values)
dates.sort()
dates
# Number of those rows per calendar date, in chronological order.
nullAge[QTimestamp].apply(pd.Timestamp.date).value_counts().sort_index()
In [ ]:
# Scratch checks on pretest/posttest pairing; only the last expression is displayed.
# NOTE(review): nullAge, _gformDF and _gformDF2 are assigned in other cells — this cell depends on their current kernel state.
# Share of distinct users that have at least one row in nullAge.
len(nullAge['userId'].unique())/len(gform['userId'].unique())
# userIds of answers of temporality answerTemporalities[0] / answerTemporalities[1] (row index preserved).
pretestIds = _gformDF[_gformDF[QTemporality] == answerTemporalities[0]]['userId']
posttestIds = _gformDF[_gformDF[QTemporality] == answerTemporalities[1]]['userId']
# Answers of one temporality whose user has no answer of the other.
posttestsWithoutPretests = posttestIds[~posttestIds.isin(pretestIds)]
pretestsWithoutPosttests = pretestIds[~pretestIds.isin(posttestIds)]
len(posttestsWithoutPretests), len(posttestIds), len(pretestsWithoutPosttests), len(pretestIds)
# Answers whose user has answers of both temporalities, seen from each side.
intersectionIds1 = pretestIds[pretestIds.isin(posttestIds)]
intersectionIds2 = posttestIds[posttestIds.isin(pretestIds)]
_gformDF.loc[intersectionIds2.index]
# Number of answers removed by getWithoutIncompleteAnswers.
len(gform) - len(getWithoutIncompleteAnswers())
# NOTE(review): presumably selects rows with a null in any profile column — survey1522DF[profileColumn] is opaque here, confirm.
_gformDF2.iloc[_gformDF2.index[pd.isnull(_gformDF2[_gformDF2.columns[survey1522DF[profileColumn]]].T).any()]]
withoutIncompleteAnswers = getWithoutIncompleteAnswers()
len(gform) - len(withoutIncompleteAnswers)
len(getWithoutIncompleteAnswers())
In [ ]:
# tests for getPerfectPretestPostestPairs
# Only the last expression is displayed; the other bare expressions are scratch checks.
# Membership check of one hard-coded userId in testUsers.
'29b739fc-4f9f-4f5e-bfee-8ba12de4b7fa' in testUsers
_gformDF3 = getWithoutIncompleteAnswers(gform)
# Per-user count of answers of temporality answerTemporalities[1], then the users having more than one.
sortedPosttests = _gformDF3[_gformDF3[QTemporality] == answerTemporalities[1]]['userId'].value_counts()
posttestDuplicatesUserIds = sortedPosttests[sortedPosttests > 1].index
# For those users only: keep the first row of each (userId, temporality) pair, in current row order.
_gformDF4 = _gformDF3[_gformDF3['userId'].isin(posttestDuplicatesUserIds)].drop_duplicates(subset=['userId', QTemporality], keep='first')
# For all users: sort by user then timestamp, and keep the earliest row of each (userId, temporality) pair.
_gformDF5 = _gformDF3.sort_values(['userId', QTimestamp]).drop_duplicates(subset=['userId', QTemporality], keep='first')
len(gform),len(_gformDF3),len(_gformDF4),len(_gformDF5)
# Timeline of the answers of the users with several such answers.
gform[gform['userId'].isin(posttestDuplicatesUserIds)][[QTimestamp, 'userId', QTemporality]].sort_values(['userId', QTimestamp])
# NOTE(review): .iloc requires positions — assumes getPosttestsWithoutPretests returns positional indices or a boolean mask; confirm.
gform.iloc[getPosttestsWithoutPretests(gform)][[QTimestamp, 'userId', QTemporality]].sort_values(['userId', QTimestamp])
In [ ]:
# tests for getPerfectPretestPostestPairs
_gformDF = gform
_gformDF2 = getWithoutIncompleteAnswers(_gformDF)
vc = _gformDF2['userId'].value_counts()
vc[vc == 1]
# remove ulterior pretests and posttests
_gformDF3 = _gformDF2.sort_values(['userId', QTimestamp]).drop_duplicates(subset=['userId', QTemporality], keep='first')
vc = _gformDF3['userId'].value_counts()
vc[vc == 1]
# only keep pretests that have matching posttests
posttestIds = _gformDF3[_gformDF3[QTemporality] == answerTemporalities[1]]['userId']
_gformDF4 = _gformDF3.drop(_gformDF3.index[~_gformDF3['userId'].isin(posttestIds)])
vc = _gformDF4['userId'].value_counts()
vc[vc == 1]
vc